COD_week10_2_MGK_BTE3207

Minsik Kim

2024-11-06

Multiple linear regression

Before begin..

Let’s load the SBP dataset.

dataset_sbp <- read.csv(file = "Inha/5_Lectures/2024/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")



head(dataset_sbp)
##   SEX BTH_G SBP DBP FBS DIS  BMI
## 1   1     1 116  78  94   4 16.6
## 2   1     1 100  60  79   4 22.3
## 3   1     1 100  60  87   4 21.9
## 4   1     1 111  70  72   4 20.2
## 5   1     1 120  80  98   4 20.0
## 6   1     1 115  79  95   4 23.1

Making a new variable hypertension

dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
                                           dataset_sbp$DBP > 80,
                                   T,
                                   F)


dataset_sbp$history_of_hypertension <- ifelse(dataset_sbp$DIS == 1 |
                                           dataset_sbp$DIS == 2,
                                   T,
                                   F)

set.seed(1)

dataset_sbp_small <- subset(dataset_sbp, row.names(dataset_sbp) %in% sample(x = 1:1000000, size = 1000))

dataset_sbp_small$DIS <- as.factor(dataset_sbp_small$DIS)

Simple linear regression

With lm() function, we can calculate the linear model of DBP~SBP with the least squares algorithm.

#A function that returns equation

lm_eqn <- function(m){
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}




ggplot(data = dataset_sbp_small, aes(y = SBP, x = DBP)) +
        geom_point() +
        theme_classic(base_family = "serif", base_size = 20) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") + 
        geom_smooth(method = "lm") +
        geom_label(y = 170, x = 68,
                  label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ DBP)),
                  parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")

lm(data = dataset_sbp_small, SBP ~ DBP) %>% confint()
##                2.5 %    97.5 %
## (Intercept) 32.20308 41.168723
## DBP          1.05807  1.175343
#rnorm(100, mean = 100, sd = 1)

Multiple regression

Basic concept

Deviding samples

In fact, there are two SEXs in the sample.

ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point(aes(col = history_of_hypertension)) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top") +
        guides(col=guide_legend(title = "History of hyper tension"))

Deviding samples

In fact, there are two SEXs in the sample.

ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point(aes(col = history_of_hypertension)) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") +
        facet_wrap(~history_of_hypertension) +
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top",
              plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
        guides(col=guide_legend(title = "History of hyper tension")) +
        geom_abline(intercept = 37, slope = 1.1, col = "blue") +
        ggtitle("y = 37 + 1.1 x")

# Multiple linear regression

lm(data = dataset_sbp_small, SBP ~ DBP) %>% summary
## 
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.723  -6.022  -1.180   5.145  46.312 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.68590    2.28442   16.06   <2e-16 ***
## DBP          1.11671    0.02988   37.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.487 on 998 degrees of freedom
## Multiple R-squared:  0.5832, Adjusted R-squared:  0.5828 
## F-statistic:  1397 on 1 and 998 DF,  p-value: < 2.2e-16
lm(data = dataset_sbp_small, SBP ~ DBP + history_of_hypertension) %>% summary
## 
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.590  -4.500  -0.984   5.702  40.255 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 40.40433    2.23748  18.058   <2e-16 ***
## DBP                          1.04867    0.02974  35.256   <2e-16 ***
## history_of_hypertensionTRUE  6.42067    0.71630   8.964   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared:  0.6143, Adjusted R-squared:  0.6135 
## F-statistic:   794 on 2 and 997 DF,  p-value: < 2.2e-16
palette()
## [1] "black"   "#DF536B" "#61D04F" "#2297E6" "#28E2E5" "#CD0BBC" "#F5C710"
## [8] "gray62"
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point(aes(col = history_of_hypertension)) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top") +
        guides(col=guide_legend(title = "History of hyper tension")) +
        geom_abline(intercept = 40, slope = 1.04, col = "#F8766D") +
        annotate(geom="text", x=120, y=140, label="LM of HT-False",
              color="#F8766D", family = "serif", size = 8) + 
        geom_abline(intercept = 46.42, slope = 1.04, col = "#00BFC4") +
        annotate(geom="text", x=100, y=170, label="LM of HT-True",
              color="#00BFC4", family = "serif", size = 8)

Continuous variable as covariate

Deviding samples

ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point() +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") +
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top",
              plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
        guides(col=guide_legend(title = "History of hyper tension")) + 
        geom_label(y = 170, x = 68,
                  label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ DBP)),
                  parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")

ggplot(dataset_sbp_small, aes(y = SBP, x = BTH_G)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point() +
        ylab("SBP (mmHg)") +
        xlab("Age group") +
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top",
              plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
        guides(col=guide_legend(title = "History of hyper tension")) + 
        geom_label(y = 170, x = 18,
                  label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ BTH_G)),
                  parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")

lm(data = dataset_sbp, formula = SBP ~ DBP)
## 
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp)
## 
## Coefficients:
## (Intercept)          DBP  
##      38.144        1.105
model_result <- lm(data = dataset_sbp, formula = SBP ~ DBP)

dataset_sbp$DIS <- as.factor(dataset_sbp$DIS)
dataset_sbp %>% summary
##       SEX           BTH_G            SBP             DBP        
##  Min.   :1.00   Min.   : 1.00   Min.   : 82.0   Min.   : 50.00  
##  1st Qu.:1.00   1st Qu.: 9.00   1st Qu.:110.0   1st Qu.: 70.00  
##  Median :1.00   Median :14.00   Median :120.0   Median : 76.00  
##  Mean   :1.49   Mean   :13.91   Mean   :121.9   Mean   : 75.79  
##  3rd Qu.:2.00   3rd Qu.:19.00   3rd Qu.:130.0   3rd Qu.: 80.00  
##  Max.   :2.00   Max.   :27.00   Max.   :190.0   Max.   :120.00  
##       FBS         DIS             BMI       hypertension   
##  Min.   : 60.00   1: 53398   Min.   :14.8   Mode :logical  
##  1st Qu.: 87.00   2:162826   1st Qu.:21.5   FALSE:683197   
##  Median : 94.00   3: 43114   Median :23.6   TRUE :316803   
##  Mean   : 98.86   4:740662   Mean   :23.8                  
##  3rd Qu.:104.00              3rd Qu.:25.8                  
##  Max.   :358.00              Max.   :40.3                  
##  history_of_hypertension
##  Mode :logical          
##  FALSE:783776           
##  TRUE :216224           
##                         
##                         
## 
dataset_sbp %>% 
        lm(formula = SBP ~ DBP + DIS + DBP * DIS) %>%
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + DIS + DBP * DIS, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.292  -4.998  -0.600   5.474  89.299 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.478249   0.325920 170.220  < 2e-16 ***
## DBP           0.958139   0.004126 232.202  < 2e-16 ***
## DIS2         -4.460697   0.375896 -11.867  < 2e-16 ***
## DIS3         -7.903914   0.499333 -15.829  < 2e-16 ***
## DIS4        -16.772084   0.337319 -49.722  < 2e-16 ***
## DBP:DIS2      0.035531   0.004735   7.504 6.17e-14 ***
## DBP:DIS3      0.042764   0.006454   6.626 3.45e-11 ***
## DBP:DIS4      0.120510   0.004285  28.124  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.415 on 999992 degrees of freedom
## Multiple R-squared:  0.5819, Adjusted R-squared:  0.5819 
## F-statistic: 1.989e+05 on 7 and 999992 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + BTH_G, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + BTH_G, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.618  -5.813  -0.839   5.478  44.670 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.41804    2.18271   15.77   <2e-16 ***
## DBP          1.06715    0.02881   37.05   <2e-16 ***
## BTH_G        0.43020    0.04153   10.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.019 on 997 degrees of freedom
## Multiple R-squared:  0.6237, Adjusted R-squared:  0.623 
## F-statistic: 826.4 on 2 and 997 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + SEX, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + SEX, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.063  -5.970  -1.219   5.211  45.634 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.03083    2.63553  14.809   <2e-16 ***
## DBP          1.10698    0.03035  36.480   <2e-16 ***
## SEX         -1.08426    0.60970  -1.778   0.0757 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.477 on 997 degrees of freedom
## Multiple R-squared:  0.5846, Adjusted R-squared:  0.5837 
## F-statistic: 701.4 on 2 and 997 DF,  p-value: < 2.2e-16
car::avPlots(dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .))

car::avPlots(dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .))
dataset_sbp_small %>%
        lm(SBP ~ SEX, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ SEX, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.779  -8.779   0.221  10.221  55.221 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 128.8716     1.4333   89.91  < 2e-16 ***
## SEX          -5.0921     0.9159   -5.56 3.47e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.47 on 998 degrees of freedom
## Multiple R-squared:  0.03004,    Adjusted R-squared:  0.02907 
## F-statistic: 30.91 on 1 and 998 DF,  p-value: 3.47e-08
dataset_sbp_small %>%
        lm(SBP ~ history_of_hypertension, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ history_of_hypertension, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.304  -8.438   0.562   9.913  51.562 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 118.4381     0.4911   241.2   <2e-16 ***
## history_of_hypertensionTRUE  12.8654     1.0376    12.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.68 on 998 degrees of freedom
## Multiple R-squared:  0.1335, Adjusted R-squared:  0.1326 
## F-statistic: 153.7 on 1 and 998 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.590  -4.500  -0.984   5.702  40.255 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 40.40433    2.23748  18.058   <2e-16 ***
## DBP                          1.04867    0.02974  35.256   <2e-16 ***
## history_of_hypertensionTRUE  6.42067    0.71630   8.964   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared:  0.6143, Adjusted R-squared:  0.6135 
## F-statistic:   794 on 2 and 997 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .) %>%
        car::avPlots()

dataset_sbp_small %>%
        ggplot(., aes(y = SBP, x = history_of_hypertension)) +
        geom_jitter()

Multiple linear regression

lm(data = dataset_sbp_small, SBP ~ DBP) %>% summary
## 
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.723  -6.022  -1.180   5.145  46.312 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.68590    2.28442   16.06   <2e-16 ***
## DBP          1.11671    0.02988   37.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.487 on 998 degrees of freedom
## Multiple R-squared:  0.5832, Adjusted R-squared:  0.5828 
## F-statistic:  1397 on 1 and 998 DF,  p-value: < 2.2e-16
lm(data = dataset_sbp_small, SBP ~ DBP + history_of_hypertension) %>% summary
## 
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.590  -4.500  -0.984   5.702  40.255 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 40.40433    2.23748  18.058   <2e-16 ***
## DBP                          1.04867    0.02974  35.256   <2e-16 ***
## history_of_hypertensionTRUE  6.42067    0.71630   8.964   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared:  0.6143, Adjusted R-squared:  0.6135 
## F-statistic:   794 on 2 and 997 DF,  p-value: < 2.2e-16
palette()
## [1] "black"   "#DF536B" "#61D04F" "#2297E6" "#28E2E5" "#CD0BBC" "#F5C710"
## [8] "gray62"
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point(aes(col = history_of_hypertension)) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top") +
        guides(col=guide_legend(title = "History of hyper tension")) +
        geom_abline(intercept = 40, slope = 1.04, col = "#F8766D") +
        annotate(geom="text", x=120, y=140, label="LM of HT-False",
              color="#F8766D", family = "serif", size = 8) + 
        geom_abline(intercept = 46.42, slope = 1.04, col = "#00BFC4") +
        annotate(geom="text", x=100, y=170, label="LM of HT-True",
              color="#00BFC4", family = "serif", size = 8)

Simulation

Bibliography

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.